Here I use posterior means:
par(mfrow=c(1,2))
hist(as.vector(unlist(eznorm)),main="E(Z|Data)/E(Z|Data)[max(abs(E(Z|Data))]",freq=F,xlab="E(Z|Data)/E(Z|Data)[max(abs(E(Z|Data))]",ylim=c(0,2))
hist(as.vector(unlist(znorm)),main="Z/Z[max(abs(Z)]",freq=F,xlab="Z/Z[max(abs(Z)]",ylim=c(0,2))
Now, we might want to plot a biplot of these values vs the value used to normalize it:
R=ncol(posterior.means)
maxes=t(apply(posterior.means,1,function(x){
rep(x[which.max(abs(x))],R)
}))
png("normstuff.png")
par(mfrow=c(1,2))
plot(eznorm,maxes,main="NormalizingZvsNormalized_PM",xlab="E(Z|data)_norm",ylab="NormalizingValue")
plot(znorm,maxes,main="NormalizingZvsNormalized_Zmle",xlab="Z_norm",ylab="NormalizingValue")
het.sign=function(effectsize){
t(apply(effectsize,1,function(x){
x/sign(x[which.max(abs(x))])
}))}
ezsign=het.sign(effectsize = posterior.means)
zsign=het.sign(effectsize = z.stat)
par(mfrow=c(1,2))
hist(as.vector(unlist(ezsign)),main="E(Z|Data)/sign(E(Z|Data)[max(abs(E(Z|Data))])",freq=F,xlab="E(Z|Data)/sign(E(Z|Data)[max(abs(E(Z|Data))])",ylim=c(0,0.4))
hist(as.vector(unlist(zsign)),main="Z/Z[max(abs(Z)]",freq=F,xlab="Z/sign(Z[max(abs(Z)])",ylim=c(0,0.4))
Thresholding:
Pull out tissue-specifics; examine examples.
Here’s an example:
j=which(rownames(z.stat)=="ENSG00000244171.3_3_142895174_T_G_b37")
barplot(as.matrix(maxz[j,]),las=2,cex.names=0.5,main=paste0("Z Statistics",rownames(maxz)[j]))
barplot(as.numeric(posterior.means[j,]),main=paste0("E(Z|EZ)",rownames(maxz)[j]),col=col.func(j,lfsr=lfsr,posterior.means=posterior.means),las=2,cex.names=0.8,ylab="PosteriorMean")
Let’s examine ENSG00000244171.3_3_142895174_T_G_b37 do discover his function:
pre-B-cell leukemia homeobox 2 pseudogene 1 [Source:HGNC Symbol;Acc:HGNC:8635]
We can find all the ones who have high loading on the ‘tissue specific’ configs:
pw.singles=post.weights[,sapply(covmat,function(x){sum(diag(x)!=0)==1})]
singleton.rank=rowSums(pw.singles)
j=which.max(singleton.rank)
par(mfrow=c(1,2))
barplot(as.matrix(maxz[j,]),las=2,cex.names=0.5,main=paste0("Z Statistics",rownames(maxz)[j]))
barplot(as.numeric(posterior.means[j,]),main=paste0("E(Z|EZ)",rownames(maxz)[j]),col=col.func(j,lfsr=lfsr,posterior.means=posterior.means),las=2,cex.names=0.8,ylab="PosteriorMean")
This particular gene is Desmoplakin, known to be involved in dilated cardiomyopathy.
pw.all=post.weights[,sapply(covmat,function(x){sum(diag(x)/max(diag(x)))/sum(x[1,2]!=0)==44})]
cons.rank=rowSums(pw.all[,6:22])
j=which.max(cons.rank)
par(mfrow=c(1,2))
barplot(as.matrix(maxz[j,]),las=2,cex.names=0.5,main=paste0("Z Statistics",rownames(maxz)[j]))
barplot(as.numeric(posterior.means[j,]),main=paste0("E(Z|EZ)",rownames(maxz)[j]),col=col.func(j,lfsr=lfsr,posterior.means=posterior.means),las=2,cex.names=0.8,ylab="PosteriorMean")
For each of your sampled beta vectors, normalize beta_t by its “largest” element (where largest means largest in absolute value, but you normalize by the actual value, not the absolute value, so that the normalized vector has maximum element +1). Then just plot a histogram of the resulting normalized beta values (pooled across all the samples). We should see very strong negative values are rare?
hetaugust=readRDS("~/Dropbox/Aug12/simulatedmvn.rds")
max.val=readRDS("~/Dropbox/Aug12/simulatedmaxval.rds")
par(mfrow=c(1,2))
hist(unlist(as.vector(hetaugust)),main="Simulated_Z/Z[,which.max(abs(Z))]",xlab="Znorms_simulatedfromPosterior",freq=F,ylim=c(0,1.5))
hist(unlist(as.vector(znorm)),main="Z/Z[,which.max(abs(Z))]",xlab="Znorm",freq=F,ylim=c(0,1.5))
Here, we divide by the maximum sign value.
hetaugust=readRDS("~/Dropbox/Aug12/simulatedmvnsign.rds")
par(mfrow=c(1,2))
hist(unlist(as.vector(hetaugust)),main="Simulated_Z/Z[,which.max(abs(Z))]",xlab="Znorms_simulatedfromPosterior",freq=F,ylim=c(0,.5))
hist(unlist(as.vector(zsign)),main="Z/sign(Z[,which.max(abs(Z))])",xlab="Znorm",freq=F,ylim=c(0,0.5))
Biplots of simulated values:
rep.col<-function(x,n){
matrix(rep(x,each=n), ncol=n, byrow=TRUE)
}
bi_plot_funct=function(sim,max.val,sim.array){
j=sim
mat.col=max.val[,j]
a=rep.col(max.val[,j],44)
plot(sim.array[,j,],a,main="MaxValuevsSimulatedNormalizedValue",xlab="SimulatedArrayDividedbyMaxVal",ylab="MaximumValue")
}
bi_plot_funcmaxz=function(max.val,znorm){
a=rep.col(max.val,44)
plot(znorm,a,main="MaxValuevsSimulatedNormalizedValue",xlab="SimulatedArrayDividedbyMaxVal",ylab="MaximumValue")
}
For each simulation, we can make a biplot against the rvalue used to normalize the vector (the maxval).
hetsign=readRDS("~/Dropbox/Aug12/simulatedmvnsign.rds")
hetval=readRDS("~/Dropbox/Aug12/simulatedmvn.rds")
png("simulatedvsmax.png")
par(mfrow=c(1,2))
bi_plot_funct(sim=5,max.val = max.val,sim.array=hetsign)
bi_plot_funct(sim=5,max.val = max.val,sim.array=hetval)
dev.off()
z.max=apply(z.stat,1,function(x){x[which.max(abs(x))]})
png("mlevsmax.png")
par(mfrow=c(1,2))
bi_plot_funcmaxz(max.val=z.max,znorm = zsign)
bi_plot_funcmaxz(max.val=z.max,znorm = znorm)
dev.off()
Understanding patterns of sharing:
We can see that the majority of weight is on the non-single rank \(U_ks\). In fact, the third \(U_k\) which corresponds to the rank 3.
barplot(diag(covmat[[num.mat[22,3]]])/max(diag(covmat[[num.mat[22,3]]])),main="Prior Effect Size Sigma per Tissue of Most Common Covariance Matrix",names=colnames(z.stat),las=2,cex.names=0.5)
Even though the sqrt(maximum\(\omega\))is only 7, this captures the vast majority (over 95%) of the data, if we follow the principle that the max(\(\omega\)) should be roughly \(se(z)\)^2+\(z^2\).
dir=
heterogeneity=read.table("../Data/het.tablewithED.txt")[,1]
hist(heterogeneity,main="Proportion of Simulations with H0 False",freq=FALSE,xlab="Proportion of Simulations with Sign Heterogeneity")
sum(heterogeneity)
## [1] 10337.48
h=which(heterogeneity>0.5)
c=which(heterogeneity<0.5)
length(h)
## [1] 10769
length(c)
## [1] 5211
summary(unlist(as.vector(lfsr[h,])))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00213 0.07410 0.14610 0.26570 0.75990
summary(unlist(as.vector(lfsr[c,])))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000000 0.0000000 0.0000841 0.1320000 0.0345400 1.0000000
par(mfrow=c(1,2))
hist(unlist(as.vector(lfsr[h,])),main=NULL,ylab=NULL,xlab=NULL,freq=FALSE,ylim=c(0,14))
title(main="LFSRs of Inconsistent genes", col.main="red",
col.sub="blue",
xlab="LFSR", ylab="Density",
col.lab="green", cex.lab=0.75,ylim=c(0,14))
hist(unlist(as.vector(lfsr[c,])),main=NULL,ylab=NULL,xlab=NULL,freq=FALSE)
title(main="LFSRs of Consistent genes", col.main="red",
col.sub="blue",
xlab="LFSR", ylab="Density",
col.lab="green", cex.lab=0.75)
But we can see that the lfsrs of the heterogeneous SNPs are consistency larger than the lfsrs of the consistent SNP, meaning that our model will alert us to their uncertainty.
To get an idea of the presence of tissue specifics, we can look at the prior weight on the tissue specific configuraitons.
thresh=0.05
dist=as.matrix(lfsr)<=thresh
ones=which(rowSums(dist)==1)
thresh=0.05
barplot(apply(lfsr[which(rowSums(lfsr<=thresh)==1),],2,function(x){sum(x<=thresh)}),las=2,cex.names=0.5,main=paste0("Number of eQTL with LFSR<",0.05," in Single Tissue"),ylim=c(0,420))
zthresh=5
barplot(apply(maxz[which(rowSums(maxz>zthresh)==1),],2,function(x){sum(x>zthresh)}),las=2,cex.names=0.5,main=paste0("Number of eQTL with Z>",zthresh,"in Single Tissue"))
singleton.weights=pis[sapply(covmat,function(x){sum(diag(x)!=0)==1})]
sum(colSums(pi.mat[,10:53]))
## [1] 0.005241691
We might say that only 0.0052417 percent of genes demonstrate tissue specificity.
And how about the ‘completely consistent’?
#singleton.weights=pis[sapply(covmat,function(x){sum(diag(x)!=0)==1})]
colSums(pi.mat)[54]
## [1] 0.02572196
Let’s understand qualitatively how matrix ash affects tissue specific effects:
First, we plot the ‘singletons’ of the ‘noisy zstats’
library(qtlcharts)
id=colnames(maxz)
iplotCurves(as.matrix(maxz)[ones,],1:44,chartOpts=list(xlab="Tissue Indices", ylab="Z_mle",pointcolor=c("slateblue")))
## Set screen size to height=700 x width=1000
#barplot(as.matrix(maxz)[ones,],las=2,colnames=colnames(maxz))
#barplot(as.matrix(posterior.means)[ones,],las=2,names=colnames(maxz))
Now, we plot the matrix-ash shrunken zstats. Look how clean!
iplotCurves(as.matrix(posterior.means[ones,]),1:44,chartOpts=list(xlab="Tissue Indices", ylab="E(Z|Data)",pointcolor=c("slateblue")))
Some tissue specific examples:
library("qtlcharts")
plot_ts("Testis",lfsr, z.stat,0.05)
plot_ts("Testis",lfsr, posterior.means,0.05)
plot_ts("Whole_Blood",lfsr, z.stat,0.05)
plot_ts("Whole_Blood",lfsr, posterior.means,0.05)
plot_ts("Muscle_Skeletal",lfsr,z.stat,0.05)
plot_ts("Liver",lfsr, z.stat,0.05)
plot_ts("Liver",lfsr, posterior.means,0.05)
plot_ts("Lung",lfsr, z.stat,0.05)
plot_ts("Lung",lfsr, posterior.means,0.05)
plot_ts("Lung",lfsr, z.stat,0.05)
plot_ts("Lung",lfsr, posterior.means,0.05)
plot_ts("Muscle_Skeletal" ,lfsr, z.stat,0.05)
plot_ts("Muscle_Skeletal" ,lfsr, posterior.means,0.05)
clean_names=sapply(rownames(maxz),function(x){ strsplit(x, "[.]")[[1]][1]})
liver=which(clean_names=="ENSG00000166923")
barfunc(genename = liver)
lung=which(clean_names=="ENSG00000151468")
barfunc(genename = lung)
testes=which(rownames(maxz)=="ENSG00000131848.5_19_56741808_G_A_b37")
##potassium channel subfamily M regulatory beta subunit 4
barfunc(genename = testes)
msk=(which(rownames(maxz)=="ENSG00000103316.6_16_21277438_C_T_b37"))
barfunc(genename = msk)
##crystallin
ms3=which(clean_names=="ENSG00000152601")
barfunc(genename = ms3)
##ENSG00000152601 muscleblind-like (Drosophila)
ms4=which(clean_names=="ENSG00000176658")
barfunc(genename = ms4)
##ENSG00000152601 muscleblind-like (Drosophila)
thyroid=(which(rownames(maxz)=="ENSG00000075275.12_22_46859003_G_C_b37"))
##cadherin, EGF LAG seven-pass G-type receptor 1
barfunc(genename = thyroid)
adrenal=(which(rownames(maxz)=="ENSG00000135643.4_12_70965479_A_G_b37"))
barfunc(genename = adrenal)
#potassium channel subfamily M regulatory beta subunit 4
wholebloodone=(which(rownames(maxz)=="ENSG00000156110.9_10_75928933_T_C_b37"))
barfunc(genename = wholebloodone)
#Adenosine kinase (HGNC Symbol)
#Entrez gene summary
#This gene an enzyme which catalyzes the transfer of the gamma-phosphate from ATP to adenosine, thereby serving as a regulator of concentrations of both extracellular adenosine and intracellular adenine nucleotides. Adenosine has widespread effects on the cardiovascular, nervous, respiratory, and immune systems and inhibitors of the enzyme could play an important pharmacological role in increasing intravascular adenosine concentrations and acting as anti-inflammatory agents. Multiple transcript variants encoding different isoforms have been found for this gene. [provided by RefSeq, Jan 2011]"
wholebloodtwo=(which(rownames(maxz)=="ENSG00000078114.14_10_21501314_T_C_b37"))
barfunc(genename = wholebloodtwo)
#Experiment name Transcription profiling of human blood taken from individuals fed an antioxidant-rich diet or a diet containing kiwi-fruit to evaluate the effect on gene expression with emphasis on stress and repair related processes [E-MEXP-2030]
# Source ArrayExpress
# Experiment description Plant-based diets rich in fruit and vegetables can prevent development of several chronic age-related diseases such as cancer and cardiovascular diseases. The mechanisms behind this protective effect is not elucidated. We have studied whether a specially designed antioxidant-rich food diet and a kiwi-fruit intervention can affect whole genome expression in human blood cells with emphasis on stress and repair related process.
# Chips
# Chip ID: OAS12-1 - Annotation by Bgee curators: Anatomical structure ID: EV:0100047 - Developmental stage ID: HsapDO:0000148
# Probeset ID: 203961_at - Mapped to gene: ENSG00000078114 - Detection flag: absent - Quality: high quality
We also denoise and shrink the consistent effects away from 0.
plot_tc(lfsr,curvedata = maxz,thresh = 0.00005)
plot_tc(lfsr,curvedata = posterior.means,thresh = 0.00005)
Power comparison
sum(lfsr<0.05)
## [1] 393414
p.vals=t(apply(z.stat,1,function(x){2*(1-pnorm(abs(x)))}))
hist(p.vals,freq=FALSE)
abline(h=1)
library('qvalue')
qs=qvalue(p.vals)
sum(qs$qvalues<0.05)
## [1] 202087
We can also ask for how many genes do we observe at least one significant tissue:
numsig=apply(lfsr,1,function(x){sum(x<0.05)})
sum(numsig!=0)
## [1] 14146
WHile using the qvalues,
numsigsumstat=apply(qs$qvalue,1,function(x){sum(x<0.05)})
sum(numsigsumstat!=0)
## [1] 16069
So we improve the ability to detect tissue-level effects by sharing information across tissues and genes to augment `modest’ effects in any one tissue in the presence of larger effects in other tissues for a given gene snp pair.
lmat=lfsr<0.05
qmat=qs$qvalue<0.05
lbetter=sum(qmat=="FALSE"&lmat=="TRUE")
qbetter=sum(lmat=="FALSE"&qmat=="TRUE")
In fact, we find that 199976 of the time, the lfsr identifies a signficiant tissue effect when the qvalue does not, while only 8649 of the time does the qvalue identify associations which our method would not call.
Here’s an example in which the effect in Adipose Visceral Omentum and Adrenal Gland is augmented by large effects in other tissues:
j=2
barfunc(j)
However, the overall number of genes detected does not imporve, because in those situation, at the gene level we probably would already have called the gene. We might expect this result, because using univariate statistics in isolation of other gene-snp pairs and in insolation of other tissues does not allow us to adjust our belief in consistently small effects. Furthermore, if a snp demonstrates a tissue specific effect that is not present at high levels in the overall data-set, the tissue specific effect might be reduced, consistent with our suspicion at the unusual pattern. Thus even though our method
Example:
j=which(numsig==0)[1]
barfunc(j)
Now, for each QTL I want to examine the posterior means of the tissues in which it was ‘active’ (at an lfsr threshold) and find QTL whose sign differed in at least 2 significant QTL. We restrict our analysis to only those gene snp pairs who show significance in at least one tissue:
l=apply(lfsr,1,function(x){sum(x<0.05)})
sum(l!=0)
## [1] 14146
Here we can see that is `r sum(l!=0)’ genes. Then we look to see if the posterior means differed in sign.
Now, for each QTL I want to examine the posterior means of the tissues in which it was ‘active’ (at an lfsr threshold) and find QTL whose sign differed in at least 2 significant QTL.
thresh=0.05
s=sapply(seq(1:nrow(posterior.means)),function(x){
l=lfsr[x,];p=posterior.means[x,];plow=p[which(l<thresh)];##grab only those posterior means that are 'significant'
if(length(plow)==0){return("FALSE")}##for ones who show no significants, they can't be heterogenous
else{pos=sum(plow>0);neg=sum(plow<0);pos*neg!=0}})
sum(s=="TRUE")
## [1] 3180
hets=which(s=="TRUE")
Now, for each snp, simulate 100 draws from mvnorm according to posterior weights; count proportion of times signs differ. This takes about 2 hours .
heterogeneity=sapply(seq(1:nrow(maxz)),function(j){sim.array.generation(j,b.j.hat=maxz,s.j,covmat,pis,sim=100)})
write.table(heterogeneity,"heterogeneityindex.txt",row.names=rownames(maxz))
We can examine the distribution of heterogeneity.
heterogeneity=read.table("../Data/het.tablewithED.txt")[,1]
hist(heterogeneity,main="Proportion of Simulations with H0 False",freq=FALSE,xlab="Proportion of Simulations with Sign Heterogeneity")
sum(heterogeneity)
## [1] 10337.48
If we compare with original z statistics:
zhets=apply(z.stat,1,function(dat){pos=sum(dat>0);neg=sum(dat<0);pos*neg!=0})
sum(zhets)
## [1] 14557